# Notebook-style analysis script: loads a saved multi-model BayesSearchCV
# experiment (via the helpsk package) and explores the results with
# pandas / plotly.
# NOTE(review): `pickle`, `time`, `pd`, `np`, and `px` are not used in this
# chunk — they may be used in cells outside this view; confirm before removing.
import pickle
import time
import os
import helpsk as hlp
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio
# Render plotly figures inline in the notebook.
pio.renderers.default='notebook'
# Load the saved experiment results from its YAML export.
file_name = os.path.join(
    '../../models/experiments/',
    'multi-model-BayesSearchCV-2022-03-01-22-52-08.yaml',
)
results = hlp.sklearn_eval.MLExperimentResults.from_yaml_file(yaml_file_name=file_name)
# Best cross-validated score found by the search (notebook cell).
results.best_score
# --- captured output of the cell above ---
0.7689418281963682
# Hyper-parameters of the best trial (notebook cell).
results.best_params
# --- captured output of the cell above ---
{'model': 'RandomForestClassifier()',
'max_features': 0.797397050836895,
'max_depth': 50,
'n_estimators': 539,
'min_samples_split': 29,
'min_samples_leaf': 4,
'max_samples': 0.6620299911421869,
'criterion': 'gini',
'imputer': "SimpleImputer(strategy='most_frequent')",
'scaler': 'None',
'pca': 'None',
'encoder': 'OneHotEncoder()'}
# Show the single best trial for each model-type.
df = results.to_formatted_dataframe(return_style=False, include_rank=True)
# Rank trials within each model-type by mean ROC-AUC (1 = best);
# `method="first"` breaks ties by row order so each group has exactly one rank 1.
within_model_rank = df.groupby("model")["roc_auc Mean"].rank(method="first", ascending=False)
df["model_rank"] = within_model_rank
df[df["model_rank"] == 1]
| rank | roc_auc Mean | roc_auc 95CI.LO | roc_auc 95CI.HI | model | C | max_features | max_depth | n_estimators | min_samples_split | min_samples_leaf | max_samples | criterion | imputer | scaler | pca | encoder | model_rank | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4 | 1 | 0.769 | 0.723 | 0.814 | RandomForestClassifier() | NaN | 0.797397 | 50.0 | 539.0 | 29.0 | 4.0 | 0.66203 | gini | SimpleImputer(strategy='most_frequent') | None | None | OneHotEncoder() | 1.0 |
| 3 | 3 | 0.763 | 0.725 | 0.802 | LogisticRegression() | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | SimpleImputer() | StandardScaler() | None | OneHotEncoder() | 1.0 |
# Styled table of all trials (up to 1000 rows), ranked.
results.to_formatted_dataframe(return_style=True, include_rank=True, num_rows=1000)
| rank | roc_auc Mean | roc_auc 95CI.LO | roc_auc 95CI.HI | model | C | max_features | max_depth | n_estimators | min_samples_split | min_samples_leaf | max_samples | criterion | imputer | scaler | pca | encoder |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0.769 | 0.723 | 0.814 | RandomForestClassifier() | <NA> | 0.797 | 50.000 | 539.000 | 29.000 | 4.000 | 0.662 | gini | SimpleImputer(strategy='most_frequent') | None | None | OneHotEncoder() |
| 2 | 0.767 | 0.720 | 0.814 | RandomForestClassifier() | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer() | None | None | OneHotEncoder() |
| 3 | 0.763 | 0.725 | 0.802 | LogisticRegression() | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer() | StandardScaler() | None | OneHotEncoder() |
| 4 | 0.762 | 0.722 | 0.802 | RandomForestClassifier() | <NA> | 0.249 | 74.000 | 729.000 | 17.000 | 14.000 | 0.789 | gini | SimpleImputer(strategy='most_frequent') | None | PCA('mle') | CustomOrdinalEncoder() |
| 5 | 0.756 | 0.712 | 0.799 | RandomForestClassifier() | <NA> | 0.401 | 87.000 | 1,056.000 | 2.000 | 11.000 | 0.691 | gini | SimpleImputer() | None | PCA('mle') | CustomOrdinalEncoder() |
| 6 | 0.747 | 0.714 | 0.781 | LogisticRegression() | 32.731 | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer() | StandardScaler() | PCA('mle') | OneHotEncoder() |
| 7 | 0.726 | 0.697 | 0.755 | LogisticRegression() | 0.000 | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer(strategy='most_frequent') | StandardScaler() | PCA('mle') | CustomOrdinalEncoder() |
| 8 | 0.713 | 0.672 | 0.753 | LogisticRegression() | 0.003 | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer(strategy='median') | MinMaxScaler() | PCA('mle') | CustomOrdinalEncoder() |
# Ranked trials for the RandomForest model-type only.
results.to_formatted_dataframe(
    query='model == "RandomForestClassifier()"',
    include_rank=True,
)
| rank | roc_auc Mean | roc_auc 95CI.LO | roc_auc 95CI.HI | max_features | max_depth | n_estimators | min_samples_split | min_samples_leaf | max_samples | criterion | imputer | pca | encoder |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0.769 | 0.723 | 0.814 | 0.797 | 50.000 | 539.000 | 29.000 | 4.000 | 0.662 | gini | SimpleImputer(strategy='most_frequent') | None | OneHotEncoder() |
| 2 | 0.767 | 0.720 | 0.814 | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer() | None | OneHotEncoder() |
| 3 | 0.762 | 0.722 | 0.802 | 0.249 | 74.000 | 729.000 | 17.000 | 14.000 | 0.789 | gini | SimpleImputer(strategy='most_frequent') | PCA('mle') | CustomOrdinalEncoder() |
| 4 | 0.756 | 0.712 | 0.799 | 0.401 | 87.000 | 1,056.000 | 2.000 | 11.000 | 0.691 | gini | SimpleImputer() | PCA('mle') | CustomOrdinalEncoder() |
# Ranked trials for the LogisticRegression model-type only.
results.to_formatted_dataframe(
    query='model == "LogisticRegression()"',
    include_rank=True,
)
| rank | roc_auc Mean | roc_auc 95CI.LO | roc_auc 95CI.HI | C | imputer | scaler | pca | encoder |
|---|---|---|---|---|---|---|---|---|
| 1 | 0.763 | 0.725 | 0.802 | <NA> | SimpleImputer() | StandardScaler() | None | OneHotEncoder() |
| 2 | 0.747 | 0.714 | 0.781 | 32.731 | SimpleImputer() | StandardScaler() | PCA('mle') | OneHotEncoder() |
| 3 | 0.726 | 0.697 | 0.755 | 0.000 | SimpleImputer(strategy='most_frequent') | StandardScaler() | PCA('mle') | CustomOrdinalEncoder() |
| 4 | 0.713 | 0.672 | 0.753 | 0.003 | SimpleImputer(strategy='median') | MinMaxScaler() | PCA('mle') | CustomOrdinalEncoder() |
# Performance across trials: faceted by model-type, then drilled into the
# RandomForest trials specifically.
results.plot_performance_across_trials(facet_by='model').show()
results.plot_performance_across_trials(query='model == "RandomForestClassifier()"').show()
results.plot_parameter_values_across_trials(query='model == "RandomForestClassifier()"').show()
# results.plot_scatter_matrix(query='model == "RandomForestClassifier()"',
#                             height=1000, width=1000).show()
# NOTE(review): `.show()` added to the two calls below for consistency with the
# other plots — they previously relied on implicit notebook display and would
# render nothing if this file were run as a plain script. Presumes these helpsk
# methods return plotly figures like their siblings above — confirm.
results.plot_performance_numeric_params(
    query='model == "RandomForestClassifier()"',
    height=800,
).show()
results.plot_parallel_coordinates(query='model == "RandomForestClassifier()"').show()
results.plot_performance_non_numeric_params(query='model == "RandomForestClassifier()"').show()
results.plot_score_vs_parameter(
    query='model == "RandomForestClassifier()"',
    parameter='max_features',
    size='max_depth',
    color='encoder',
).show()
# NOTE(review): the commented blocks below query an XGBClassifier model-type
# that does not appear in this experiment's results shown above (only
# RandomForestClassifier and LogisticRegression) — presumably kept for a
# future run that includes XGBoost; confirm or delete.
# results.plot_parameter_vs_parameter(
#     query='model == "XGBClassifier()"',
#     parameter_x='colsample_bytree',
#     parameter_y='learning_rate',
#     size='max_depth'
# )
# results.plot_parameter_vs_parameter(
#     query='model == "XGBClassifier()"',
#     parameter_x='colsample_bytree',
#     parameter_y='learning_rate',
#     size='imputer'
# )